# Reproducibility: fix the RNG state before drawing the split
set.seed(100)

# 80/20 train/test split, stratified on the outcome so both partitions
# keep a similar attrition rate
splitted <- initial_split(data_clean, prop = 0.8, strata = "attrition")

# quick check: prints <train/test/total> row counts
splitted
# Preprocessing recipe, estimated on the training data only so the
# test set never leaks into the learned parameters (centers, scales,
# near-zero-variance filters, downsampling).
rec <- recipe(attrition ~ ., data = training(splitted)) %>%
  # drop pure identifier columns that carry no predictive signal
  step_rm(employee_count, employee_number) %>%
  # remove near-zero-variance predictors
  step_nzv(all_predictors()) %>%
  # encode nominal predictors as factors; set the outcome's level
  # order explicitly so "yes" is the first (positive) level
  step_string2factor(all_nominal(), -attrition) %>%
  step_string2factor(attrition, levels = c("yes", "no")) %>%
  # balance the classes (1:1) — applied to training data only
  step_downsample(attrition, ratio = 1/1, seed = 100) %>%
  # standardize numeric predictors
  step_center(all_numeric()) %>%
  step_scale(all_numeric()) %>%
  prep(strings_as_factors = FALSE)

# Materialize the processed train and test sets.
# bake(new_data = NULL) is the modern replacement for juice().
data_train <- bake(rec, new_data = NULL)
data_test <- bake(rec, testing(splitted))

# quick check — reuse data_train instead of re-baking via juice()
head(data_train, 10)

# define model specification
# Random forest specification: 1000 trees, 29 candidate predictors per
# split, minimum node size of 15, fitted with the ranger engine
# (impurity-based importance, half the available cores).
model_spec <- rand_forest(
  mode = "classification",
  mtry = 29,
  trees = 1000,
  min_n = 15
) %>%
  set_engine(
    engine = "ranger",
    seed = 100,
    num.threads = parallel::detectCores() / 2,
    importance = "impurity"
  )

# quick check: prints main + engine-specific arguments
model_spec
# Fit the random forest on the preprocessed (downsampled) training
# data. fit_xy() takes predictors and outcome separately.
x_train <- select(data_train, -attrition)
y_train <- select(data_train, attrition)
model <- fit_xy(
  object = model_spec,
  x = x_train,
  y = y_train
)

# quick check: prints the underlying ranger fit (OOB error, mtry, ...)
model
# Variable importance from the fitted ranger model.
# NOTE: tidy() on a bare named numeric vector is deprecated/defunct in
# current broom; enframe() produces the same two-column layout directly.
var_imp <- tibble::enframe(
  model$fit$variable.importance,
  name = "variable",
  value = "importance"
)

# Keep the 10 MOST important variables. The importance vector comes
# back in predictor-column order, so it must be sorted first — the
# original head(10) silently took the first 10 columns instead of the
# top 10 by importance.
var_imp <- var_imp %>%
  arrange(desc(importance)) %>%
  head(10) %>%
  mutate(variable = reorder(variable, importance))

# variable importance plot (horizontal bars, largest on top)
ggplot(var_imp, aes(x = variable, y = importance)) +
  geom_col(fill = "darkblue") +
  coord_flip() +
  labs(title = "Variables Importance (Top 10)", x = NULL, y = NULL, fill = NULL) +
  # expansion() replaces the deprecated expand_scale()
  scale_y_continuous(expand = expansion(mult = c(0, 0.1))) +
  theme_minimal()

# predict on test
# Score the test set: true label + hard class prediction + class
# probabilities, column-bound into one data frame. The feature frame
# is built once and reused for both predict() calls.
test_features <- select(data_test, -attrition)
pred_test <- data_test %>%
  select(attrition) %>%
  bind_cols(predict(model, test_features)) %>%
  bind_cols(predict(model, test_features, type = "prob"))

# quick check
head(pred_test, 10)

# Metrics summary. yardstick's *_vec() metrics treat the first factor
# level ("yes", per the recipe) as the positive event by default.
pred_test %>%
  summarise(
    accuracy = accuracy_vec(attrition, .pred_class),
    sensitivity = sens_vec(attrition, .pred_class),
    specificity = spec_vec(attrition, .pred_class),
    precision = precision_vec(attrition, .pred_class)
  )

# get roc curve data on test dataset
# ROC curve data on the test set, using P(attrition == "yes")
pred_test_roc <- pred_test %>%
  roc_curve(attrition, .pred_yes)

# Tidying: round for a lighter plotly payload, then reshape to long
# format. across()/pivot_longer() replace the superseded
# mutate_if()/gather() pair.
pred_test_roc <- pred_test_roc %>%
  mutate(across(where(is.numeric), ~ round(.x, 4))) %>%
  pivot_longer(-.threshold, names_to = "metric", values_to = "value")

# plot sensitivity-specificity trade-off across thresholds
p <- ggplot(pred_test_roc, aes(x = .threshold, y = value)) +
  geom_line(aes(colour = metric)) +
  labs(x = "Probability Threshold to be Classified as Positive", y = "Value", colour = "Metrics") +
  theme_minimal()
ggplotly(p)

# get pr curve data on test dataset
# Precision-recall curve data on the test set, using
# P(attrition == "yes")
pred_test_pr <- pred_test %>%
  pr_curve(attrition, .pred_yes)

# Tidying: same modernization as the ROC data —
# across()/pivot_longer() replace the superseded mutate_if()/gather().
pred_test_pr <- pred_test_pr %>%
  mutate(across(where(is.numeric), ~ round(.x, 4))) %>%
  pivot_longer(-.threshold, names_to = "metric", values_to = "value")

# plot recall-precision trade-off across thresholds
p <- ggplot(pred_test_pr, aes(x = .threshold, y = value)) +
  geom_line(aes(colour = metric)) +
  labs(x = "Probability Threshold to be Classified as Positive", y = "Value", colour = "Metrics") +
  theme_minimal()
ggplotly(p)